{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "075aa2fc",
   "metadata": {},
   "outputs": [
    {
     "ename": "SSLError",
     "evalue": "HTTPSConnectionPool(host='tieba.baidu.com', port=443): Max retries exceeded with url: /p/7882177660?pn=1 (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self signed certificate (_ssl.c:1125)')))",
     "output_type": "error",
     "traceback": [
      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[1;31mSSLCertVerificationError\u001b[0m                  Traceback (most recent call last)",
      "\u001b[1;32m~\\anaconda3\\lib\\site-packages\\urllib3\\connectionpool.py\u001b[0m in \u001b[0;36murlopen\u001b[1;34m(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)\u001b[0m\n\u001b[0;32m    698\u001b[0m             \u001b[1;31m# Make the request on the httplib connection object.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 699\u001b[1;33m             httplib_response = self._make_request(\n\u001b[0m\u001b[0;32m    700\u001b[0m                 \u001b[0mconn\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m~\\anaconda3\\lib\\site-packages\\urllib3\\connectionpool.py\u001b[0m in \u001b[0;36m_make_request\u001b[1;34m(self, conn, method, url, timeout, chunked, **httplib_request_kw)\u001b[0m\n\u001b[0;32m    381\u001b[0m         \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 382\u001b[1;33m             \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_validate_conn\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mconn\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    383\u001b[0m         \u001b[1;32mexcept\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mSocketTimeout\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mBaseSSLError\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m~\\anaconda3\\lib\\site-packages\\urllib3\\connectionpool.py\u001b[0m in \u001b[0;36m_validate_conn\u001b[1;34m(self, conn)\u001b[0m\n\u001b[0;32m   1009\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mgetattr\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mconn\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m\"sock\"\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;32mNone\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m  \u001b[1;31m# AppEngine might not have  `.sock`\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1010\u001b[1;33m             \u001b[0mconn\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mconnect\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   1011\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m~\\anaconda3\\lib\\site-packages\\urllib3\\connection.py\u001b[0m in \u001b[0;36mconnect\u001b[1;34m(self)\u001b[0m\n\u001b[0;32m    410\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 411\u001b[1;33m         self.sock = ssl_wrap_socket(\n\u001b[0m\u001b[0;32m    412\u001b[0m             \u001b[0msock\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mconn\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m~\\anaconda3\\lib\\site-packages\\urllib3\\util\\ssl_.py\u001b[0m in \u001b[0;36mssl_wrap_socket\u001b[1;34m(sock, keyfile, certfile, cert_reqs, ca_certs, server_hostname, ssl_version, ciphers, ssl_context, ca_cert_dir, key_password, ca_cert_data, tls_in_tls)\u001b[0m\n\u001b[0;32m    427\u001b[0m     \u001b[1;32mif\u001b[0m \u001b[0msend_sni\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 428\u001b[1;33m         ssl_sock = _ssl_wrap_socket_impl(\n\u001b[0m\u001b[0;32m    429\u001b[0m             \u001b[0msock\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcontext\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mtls_in_tls\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mserver_hostname\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mserver_hostname\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m~\\anaconda3\\lib\\site-packages\\urllib3\\util\\ssl_.py\u001b[0m in \u001b[0;36m_ssl_wrap_socket_impl\u001b[1;34m(sock, ssl_context, tls_in_tls, server_hostname)\u001b[0m\n\u001b[0;32m    471\u001b[0m     \u001b[1;32mif\u001b[0m \u001b[0mserver_hostname\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 472\u001b[1;33m         \u001b[1;32mreturn\u001b[0m \u001b[0mssl_context\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mwrap_socket\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msock\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mserver_hostname\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mserver_hostname\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    473\u001b[0m     \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m~\\anaconda3\\lib\\ssl.py\u001b[0m in \u001b[0;36mwrap_socket\u001b[1;34m(self, sock, server_side, do_handshake_on_connect, suppress_ragged_eofs, server_hostname, session)\u001b[0m\n\u001b[0;32m    499\u001b[0m         \u001b[1;31m# ctx._wrap_socket()\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 500\u001b[1;33m         return self.sslsocket_class._create(\n\u001b[0m\u001b[0;32m    501\u001b[0m             \u001b[0msock\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0msock\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m~\\anaconda3\\lib\\ssl.py\u001b[0m in \u001b[0;36m_create\u001b[1;34m(cls, sock, server_side, do_handshake_on_connect, suppress_ragged_eofs, server_hostname, context, session)\u001b[0m\n\u001b[0;32m   1039\u001b[0m                         \u001b[1;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"do_handshake_on_connect should not be specified for non-blocking sockets\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1040\u001b[1;33m                     \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdo_handshake\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   1041\u001b[0m             \u001b[1;32mexcept\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mOSError\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mValueError\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m~\\anaconda3\\lib\\ssl.py\u001b[0m in \u001b[0;36mdo_handshake\u001b[1;34m(self, block)\u001b[0m\n\u001b[0;32m   1308\u001b[0m                 \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msettimeout\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;32mNone\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m-> 1309\u001b[1;33m             \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_sslobj\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mdo_handshake\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m   1310\u001b[0m         \u001b[1;32mfinally\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mSSLCertVerificationError\u001b[0m: [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self signed certificate (_ssl.c:1125)",
      "\nDuring handling of the above exception, another exception occurred:\n",
      "\u001b[1;31mMaxRetryError\u001b[0m                             Traceback (most recent call last)",
      "\u001b[1;32m~\\anaconda3\\lib\\site-packages\\requests\\adapters.py\u001b[0m in \u001b[0;36msend\u001b[1;34m(self, request, stream, timeout, verify, cert, proxies)\u001b[0m\n\u001b[0;32m    438\u001b[0m             \u001b[1;32mif\u001b[0m \u001b[1;32mnot\u001b[0m \u001b[0mchunked\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 439\u001b[1;33m                 resp = conn.urlopen(\n\u001b[0m\u001b[0;32m    440\u001b[0m                     \u001b[0mmethod\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mrequest\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mmethod\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m~\\anaconda3\\lib\\site-packages\\urllib3\\connectionpool.py\u001b[0m in \u001b[0;36murlopen\u001b[1;34m(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)\u001b[0m\n\u001b[0;32m    754\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 755\u001b[1;33m             retries = retries.increment(\n\u001b[0m\u001b[0;32m    756\u001b[0m                 \u001b[0mmethod\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0murl\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0merror\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0me\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0m_pool\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0m_stacktrace\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0msys\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mexc_info\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m[\u001b[0m\u001b[1;36m2\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m~\\anaconda3\\lib\\site-packages\\urllib3\\util\\retry.py\u001b[0m in \u001b[0;36mincrement\u001b[1;34m(self, method, url, response, error, _pool, _stacktrace)\u001b[0m\n\u001b[0;32m    573\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[0mnew_retry\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mis_exhausted\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 574\u001b[1;33m             \u001b[1;32mraise\u001b[0m \u001b[0mMaxRetryError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0m_pool\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0murl\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0merror\u001b[0m \u001b[1;32mor\u001b[0m \u001b[0mResponseError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcause\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    575\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mMaxRetryError\u001b[0m: HTTPSConnectionPool(host='tieba.baidu.com', port=443): Max retries exceeded with url: /p/7882177660?pn=1 (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self signed certificate (_ssl.c:1125)')))",
      "\nDuring handling of the above exception, another exception occurred:\n",
      "\u001b[1;31mSSLError\u001b[0m                                  Traceback (most recent call last)",
      "\u001b[1;32m<ipython-input-1-0884c8263f1c>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m\u001b[0m\n\u001b[0;32m     32\u001b[0m         \u001b[0mcsvwriter\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mwriterow\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'评论用户'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'评论时间'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;34m'评论内容'\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     33\u001b[0m         \u001b[1;32mfor\u001b[0m \u001b[0mpage\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mrange\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m1\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;36m16\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m  \u001b[1;31m# 爬取前15页的内容\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 34\u001b[1;33m             \u001b[0mmain\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mpage\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mcsvwriter\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m     35\u001b[0m             \u001b[0mtime\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msleep\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m2\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m<ipython-input-1-0884c8263f1c>\u001b[0m in \u001b[0;36mmain\u001b[1;34m(page, csvwriter)\u001b[0m\n\u001b[0;32m     11\u001b[0m         \u001b[1;34m'User-Agent'\u001b[0m\u001b[1;33m:\u001b[0m \u001b[1;34m'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     12\u001b[0m     }\n\u001b[1;32m---> 13\u001b[1;33m     \u001b[0mresp\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mrequests\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mget\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0murl\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mheaders\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mheaders\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m     14\u001b[0m     \u001b[0mhtml\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mresp\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtext\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     15\u001b[0m     \u001b[1;31m# 评论内容\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m~\\anaconda3\\lib\\site-packages\\requests\\api.py\u001b[0m in \u001b[0;36mget\u001b[1;34m(url, params, **kwargs)\u001b[0m\n\u001b[0;32m     74\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     75\u001b[0m     \u001b[0mkwargs\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msetdefault\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'allow_redirects'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;32mTrue\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 76\u001b[1;33m     \u001b[1;32mreturn\u001b[0m \u001b[0mrequest\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m'get'\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0murl\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mparams\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mparams\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m     77\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     78\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m~\\anaconda3\\lib\\site-packages\\requests\\api.py\u001b[0m in \u001b[0;36mrequest\u001b[1;34m(method, url, **kwargs)\u001b[0m\n\u001b[0;32m     59\u001b[0m     \u001b[1;31m# cases, and look like a memory leak in others.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     60\u001b[0m     \u001b[1;32mwith\u001b[0m \u001b[0msessions\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mSession\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0msession\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 61\u001b[1;33m         \u001b[1;32mreturn\u001b[0m \u001b[0msession\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mrequest\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mmethod\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mmethod\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0murl\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0murl\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m     62\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     63\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m~\\anaconda3\\lib\\site-packages\\requests\\sessions.py\u001b[0m in \u001b[0;36mrequest\u001b[1;34m(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)\u001b[0m\n\u001b[0;32m    540\u001b[0m         }\n\u001b[0;32m    541\u001b[0m         \u001b[0msend_kwargs\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mupdate\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0msettings\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 542\u001b[1;33m         \u001b[0mresp\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msend\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mprep\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0msend_kwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    543\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    544\u001b[0m         \u001b[1;32mreturn\u001b[0m \u001b[0mresp\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m~\\anaconda3\\lib\\site-packages\\requests\\sessions.py\u001b[0m in \u001b[0;36msend\u001b[1;34m(self, request, **kwargs)\u001b[0m\n\u001b[0;32m    653\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    654\u001b[0m         \u001b[1;31m# Send the request\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 655\u001b[1;33m         \u001b[0mr\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0madapter\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msend\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mrequest\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    656\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    657\u001b[0m         \u001b[1;31m# Total elapsed time of the request (approximately)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;32m~\\anaconda3\\lib\\site-packages\\requests\\adapters.py\u001b[0m in \u001b[0;36msend\u001b[1;34m(self, request, stream, timeout, verify, cert, proxies)\u001b[0m\n\u001b[0;32m    512\u001b[0m             \u001b[1;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0me\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mreason\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0m_SSLError\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    513\u001b[0m                 \u001b[1;31m# This branch is for urllib3 v1.22 and later.\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 514\u001b[1;33m                 \u001b[1;32mraise\u001b[0m \u001b[0mSSLError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0me\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mrequest\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mrequest\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    515\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    516\u001b[0m             \u001b[1;32mraise\u001b[0m \u001b[0mConnectionError\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0me\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mrequest\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mrequest\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
      "\u001b[1;31mSSLError\u001b[0m: HTTPSConnectionPool(host='tieba.baidu.com', port=443): Max retries exceeded with url: /p/7882177660?pn=1 (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self signed certificate (_ssl.c:1125)')))"
     ]
    }
   ],
   "source": [
    "import csv\n",
    "import requests\n",
    "import re\n",
    "import time\n",
    "\n",
    "def main(page, csvwriter):\n",
    "    url = f'https://tieba.baidu.com/p/7882177660?pn={page}'\n",
    "    # urls = f'https://tieba.baidu.com/p/8137888365?pn={page}'\n",
    "\n",
    "    headers = {\n",
    "        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36'\n",
    "    }\n",
    "    resp = requests.get(url, headers=headers)\n",
    "    html = resp.text\n",
    "    # 评论内容\n",
    "    comments = re.findall('style=\"display:;\">                    (.*?)</div>', html)\n",
    "    # 评论用户\n",
    "    users = re.findall('class=\"p_author_name j_user_card\" href=\".*?\" target=\"_blank\">(.*?)</a>', html)\n",
    "    # 评论时间\n",
    "    comment_times = re.findall('楼</span><span class=\"tail-info\">(.*?)</span><div', html)\n",
    "    for u, c, t in zip(users, comments, comment_times):\n",
    "        # 筛选数据,过滤掉异常数据\n",
    "        if 'img' in c or 'div' in c or len(u) > 50:\n",
    "            continue\n",
    "        csvwriter.writerow((u, t, c))\n",
    "        print(u, t, c)\n",
    "    print(f'第{page}页爬取完毕')\n",
    "\n",
    "if __name__ == '__main__':\n",
    "    with open('./data/tboutput.csv', 'a', encoding='utf-8', newline='') as f:\n",
    "        csvwriter = csv.writer(f)\n",
    "        csvwriter.writerow(('评论用户', '评论时间', '评论内容'))\n",
    "        for page in range(1, 16):  # 爬取前15页的内容\n",
    "            main(page, csvwriter)\n",
    "            time.sleep(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "db7f091f",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "e33612c8",
   "metadata": {},
   "outputs": [],
   "source": [
    "tb_csv = pd.read_csv('./data/tboutput.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "f7c0ceb8",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>评论用户</th>\n",
       "      <th>评论时间</th>\n",
       "      <th>评论内容</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>神猫大侠</td>\n",
       "      <td>2022-06-17 13:36</td>\n",
       "      <td>我支持克莱因为他穿的是安踏球鞋</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>☆ost空之光</td>\n",
       "      <td>2022-06-17 13:37</td>\n",
       "      <td>就体育故事来说 克莱</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>牢西大佐</td>\n",
       "      <td>2022-06-17 22:42</td>\n",
       "      <td>鸡屎高</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>寻歧</td>\n",
       "      <td>2022-06-17 22:53</td>\n",
       "      <td>你让克莱有哈登那哨子试试</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>库触</td>\n",
       "      <td>2022-06-17 23:12</td>\n",
       "      <td>哈登啊</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>111</th>\n",
       "      <td>决胜时刻</td>\n",
       "      <td>2022-06-26 20:58</td>\n",
       "      <td>懂了，三连冠麦肯吊打裤畜</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>112</th>\n",
       "      <td>BRshouter</td>\n",
       "      <td>2022-06-27 15:28</td>\n",
       "      <td>阿登不遵守交通法规，扣分</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>113</th>\n",
       "      <td>笑仁天皇</td>\n",
       "      <td>2022-06-27 20:37</td>\n",
       "      <td>哈登是球队老大得过mvp，汤神就是十个冠军也没法和哈登比，看看皮蓬的排名历史级别的老二排多少？</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>114</th>\n",
       "      <td>秀色动人</td>\n",
       "      <td>2022-06-29 20:29</td>\n",
       "      <td>我是真讨厌哈鸡屎打球，撅屁股碰瓷拉稀</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>115</th>\n",
       "      <td>评论用户</td>\n",
       "      <td>评论时间</td>\n",
       "      <td>评论内容</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>116 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "          评论用户              评论时间  \\\n",
       "0         神猫大侠  2022-06-17 13:36   \n",
       "1      ☆ost空之光  2022-06-17 13:37   \n",
       "2         牢西大佐  2022-06-17 22:42   \n",
       "3           寻歧  2022-06-17 22:53   \n",
       "4           库触  2022-06-17 23:12   \n",
       "..         ...               ...   \n",
       "111       决胜时刻  2022-06-26 20:58   \n",
       "112  BRshouter  2022-06-27 15:28   \n",
       "113       笑仁天皇  2022-06-27 20:37   \n",
       "114       秀色动人  2022-06-29 20:29   \n",
       "115       评论用户              评论时间   \n",
       "\n",
       "                                                评论内容  \n",
       "0                                    我支持克莱因为他穿的是安踏球鞋  \n",
       "1                                         就体育故事来说 克莱  \n",
       "2                                                鸡屎高  \n",
       "3                                       你让克莱有哈登那哨子试试  \n",
       "4                                                哈登啊  \n",
       "..                                               ...  \n",
       "111                                     懂了，三连冠麦肯吊打裤畜  \n",
       "112                                     阿登不遵守交通法规，扣分  \n",
       "113  哈登是球队老大得过mvp，汤神就是十个冠军也没法和哈登比，看看皮蓬的排名历史级别的老二排多少？  \n",
       "114                               我是真讨厌哈鸡屎打球，撅屁股碰瓷拉稀  \n",
       "115                                             评论内容  \n",
       "\n",
       "[116 rows x 3 columns]"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tb_csv"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "15b3a6a2",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>评论用户</th>\n",
       "      <th>评论时间</th>\n",
       "      <th>评论内容</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>神猫大侠</td>\n",
       "      <td>2022-06-17 13:36</td>\n",
       "      <td>我支持克莱因为他穿的是安踏球鞋</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>☆ost空之光</td>\n",
       "      <td>2022-06-17 13:37</td>\n",
       "      <td>就体育故事来说 克莱</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>牢西大佐</td>\n",
       "      <td>2022-06-17 22:42</td>\n",
       "      <td>鸡屎高</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>寻歧</td>\n",
       "      <td>2022-06-17 22:53</td>\n",
       "      <td>你让克莱有哈登那哨子试试</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>库触</td>\n",
       "      <td>2022-06-17 23:12</td>\n",
       "      <td>哈登啊</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      评论用户              评论时间             评论内容\n",
       "0     神猫大侠  2022-06-17 13:36  我支持克莱因为他穿的是安踏球鞋\n",
       "1  ☆ost空之光  2022-06-17 13:37       就体育故事来说 克莱\n",
       "2     牢西大佐  2022-06-17 22:42              鸡屎高\n",
       "3       寻歧  2022-06-17 22:53     你让克莱有哈登那哨子试试\n",
       "4       库触  2022-06-17 23:12              哈登啊"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tb_csv.head() #查看前几行数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "81018148",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(116, 3)"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tb_csv.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "f22cd164",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Index(['评论用户', '评论时间', '评论内容'], dtype='object')"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tb_csv.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "59721570",
   "metadata": {},
   "outputs": [],
   "source": [
    "tb_txt = pd.read_table('./data/thoutput.txt')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "f8dc366e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>评论用户,评论时间,评论内容</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>神猫大侠,2022-06-17 13:36,我支持克莱因为他穿的是安踏球鞋</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>☆ost空之光,2022-06-17 13:37,就体育故事来说 克莱</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>牢西大佐,2022-06-17 22:42,鸡屎高</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>寻歧,2022-06-17 22:53,你让克莱有哈登那哨子试试</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>库触,2022-06-17 23:12,哈登啊</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>110</th>\n",
       "      <td>詹者如皇😋,2022-06-24 15:30,看到这个标题，我就知道鸡屎密急了[看戏]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>111</th>\n",
       "      <td>决胜时刻,2022-06-26 20:58,懂了，三连冠麦肯吊打裤畜</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>112</th>\n",
       "      <td>BRshouter,2022-06-27 15:28,阿登不遵守交通法规，扣分</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>113</th>\n",
       "      <td>笑仁天皇,2022-06-27 20:37,哈登是球队老大得过mvp，汤神就是十个冠军也没法...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>114</th>\n",
       "      <td>秀色动人,2022-06-29 20:29,我是真讨厌哈鸡屎打球，撅屁股碰瓷拉稀</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>115 rows × 1 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                        评论用户,评论时间,评论内容\n",
       "0                神猫大侠,2022-06-17 13:36,我支持克莱因为他穿的是安踏球鞋\n",
       "1                  ☆ost空之光,2022-06-17 13:37,就体育故事来说 克莱\n",
       "2                            牢西大佐,2022-06-17 22:42,鸡屎高\n",
       "3                     寻歧,2022-06-17 22:53,你让克莱有哈登那哨子试试\n",
       "4                              库触,2022-06-17 23:12,哈登啊\n",
       "..                                                 ...\n",
       "110        詹者如皇😋,2022-06-24 15:30,看到这个标题，我就知道鸡屎密急了[看戏]\n",
       "111                 决胜时刻,2022-06-26 20:58,懂了，三连冠麦肯吊打裤畜\n",
       "112            BRshouter,2022-06-27 15:28,阿登不遵守交通法规，扣分\n",
       "113  笑仁天皇,2022-06-27 20:37,哈登是球队老大得过mvp，汤神就是十个冠军也没法...\n",
       "114           秀色动人,2022-06-29 20:29,我是真讨厌哈鸡屎打球，撅屁股碰瓷拉稀\n",
       "\n",
       "[115 rows x 1 columns]"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tb_txt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8bae4222",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
